package com.self.test.webmagic;

import java.util.List;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * WebMagic {@link PageProcessor} that crawls GitHub repository search result pages.
 *
 * <p>For each downloaded page it collects the links found inside the
 * {@code div.pagination} element and queues them as further requests. No other
 * data is extracted from the page by this class.
 */
public class GithubRepoPageProcessor implements PageProcessor {
	/** Search-result URL handed to the spider as its starting point in {@link #main(String[])}. */
	private static final String START_URL =
			"https://github.com/search?l=Java&p=1&q=stars%3A%3E1&s=stars&type=Repositories";

	/** Crawl configuration: retry times set to 3 and sleep time set to 1000. */
	private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

	/**
	 * Discovers follow-up URLs on a downloaded page.
	 *
	 * @param page the page handed over by the spider
	 */
	@Override
	public void process(Page page) {
		// Gather every link located inside the pagination block of the page.
		List<String> paginationLinks = page.getHtml()
				.css("div.pagination")
				.links()
				.all();

		// Queue the discovered links so they get crawled as well.
		page.addTargetRequests(paginationLinks);
	}

	/**
	 * Returns the crawl configuration used by this processor.
	 *
	 * @return the {@link Site} instance configured for this processor
	 */
	@Override
	public Site getSite() {
		return site;
	}

	/**
	 * Entry point: builds a spider around this processor and runs it.
	 *
	 * @param args command-line arguments (unused)
	 */
	public static void main(String[] args) {
		PageProcessor processor = new GithubRepoPageProcessor();
		Spider.create(processor)
			// Start crawling from the GitHub search-result URL.
			.addUrl(START_URL)
			// Crawl with 5 threads.
			.thread(5)
			// Launch the spider.
			.run();
	}
}
